{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Time-domain analysis of Instagram posts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "C:\\Users\\tania\\Documents\\SDS\\Thesis\\Sandbox\\EH_Instaphyte\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import pytz\n",
    "import time\n",
    "\n",
    "from datetime import datetime\n",
    "\n",
    "CWD = os.getcwd()\n",
    "# print(CWD)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Quick frequency analysis for English Heritage quiz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "def getYearCount(hashtag,year):\n",
    "    temp_df = pd.read_csv(\"{}.csv\".format(hashtag),usecols=['node.shortcode','node.taken_at_timestamp'])\n",
    "    temp_df['year'] = temp_df.apply(lambda row: int(datetime.utcfromtimestamp(row['node.taken_at_timestamp']).strftime('%Y')), axis=1)\n",
    "    return len(temp_df[temp_df['year']==year])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "146\tappuldurcombehouse\n",
      "387\tbylandabbey\n",
      "139\tmonkbrettonpriory\n",
      "173\twolveseycastle\n"
     ]
    }
   ],
   "source": [
    "sites = ['appuldurcombehouse','bylandabbey','monkbrettonpriory','wolveseycastle']\n",
    "print(\"\\n\".join([str(getYearCount(site,2018)) + \"\\t\" + site for site in sites]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Obtaining unique posts across each site's relevant hashtags and locations"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Parsing ``.csv`` files of scraped data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Reading in the hashtags, locations, and location IDs of each site:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 174,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>tags</th>\n",
       "      <th>locations</th>\n",
       "      <th>locationids</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>EH_name</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>AppuldurcombeHouse</th>\n",
       "      <td>[appuldurcombehouse, appuldurcombe]</td>\n",
       "      <td>[Appuldurcombe House]</td>\n",
       "      <td>[305532]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>BelasKnapLongBarrow</th>\n",
       "      <td>[belasknap, belasknapp, belasknaplongbarrow, b...</td>\n",
       "      <td>[Belas Knap Long Barrow, Belas Knap]</td>\n",
       "      <td>[497373440775663, 228550224]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>BinhamPriory</th>\n",
       "      <td>[binhampriory, binhamprioryruins, binhampriory...</td>\n",
       "      <td>[Binham Priory]</td>\n",
       "      <td>[971275658]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>BuryStEdmundsAbbey</th>\n",
       "      <td>[burystedmundsabbeygarden, burystedmundsabbeyg...</td>\n",
       "      <td>[Abbey Gardens, Abbey Gardens Bury St Edmunds,...</td>\n",
       "      <td>[753599368361985, 227566504401689, 275566117]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>BylandAbbey</th>\n",
       "      <td>[bylandabbey]</td>\n",
       "      <td>[Byland Abbey]</td>\n",
       "      <td>[272833761]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                  tags  \\\n",
       "EH_name                                                                  \n",
       "AppuldurcombeHouse                 [appuldurcombehouse, appuldurcombe]   \n",
       "BelasKnapLongBarrow  [belasknap, belasknapp, belasknaplongbarrow, b...   \n",
       "BinhamPriory         [binhampriory, binhamprioryruins, binhampriory...   \n",
       "BuryStEdmundsAbbey   [burystedmundsabbeygarden, burystedmundsabbeyg...   \n",
       "BylandAbbey                                              [bylandabbey]   \n",
       "\n",
       "                                                             locations  \\\n",
       "EH_name                                                                  \n",
       "AppuldurcombeHouse                               [Appuldurcombe House]   \n",
       "BelasKnapLongBarrow               [Belas Knap Long Barrow, Belas Knap]   \n",
       "BinhamPriory                                           [Binham Priory]   \n",
       "BuryStEdmundsAbbey   [Abbey Gardens, Abbey Gardens Bury St Edmunds,...   \n",
       "BylandAbbey                                             [Byland Abbey]   \n",
       "\n",
       "                                                       locationids  \n",
       "EH_name                                                             \n",
       "AppuldurcombeHouse                                        [305532]  \n",
       "BelasKnapLongBarrow                   [497373440775663, 228550224]  \n",
       "BinhamPriory                                           [971275658]  \n",
       "BuryStEdmundsAbbey   [753599368361985, 227566504401689, 275566117]  \n",
       "BylandAbbey                                            [272833761]  "
      ]
     },
     "execution_count": 174,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "keyfreesite_df = pd.read_excel('EH_Instagram.xlsx',sheet_name='keyfreesite',usecols='B:Q',\n",
    "                    converters={'locationid1': np.int64, 'locationid2': np.int64,\n",
    "                          'locationid3': np.int64,'locationid4': np.int64,\n",
    "                          'locationid5': np.int64})\n",
    "keyfreesite_df['tags'] = keyfreesite_df[['tag1','tag2','tag3','tag4','tag5']].T.apply(lambda x: x.dropna().tolist())\n",
    "keyfreesite_df['locations'] = keyfreesite_df[['location1','location2','location3','location4','location5']].T.apply(lambda x: x.dropna().tolist())\n",
    "keyfreesite_df['locationids'] = keyfreesite_df[['locationid1','locationid2','locationid3','locationid4','locationid5']].T.apply(lambda x: x.dropna().tolist())\n",
    "keyfreesite_df = keyfreesite_df.drop(['tag1','tag2','tag3','tag4','tag5',\n",
    "                                     'location1','location2','location3','location4','location5',\n",
    "                                     'locationid1','locationid2','locationid3','locationid4','locationid5'],axis=1)\n",
    "keyfreesite_df = keyfreesite_df.set_index('EH_name')\n",
    "keyfreesite_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The scripts below read each ``.csv`` file corresponding to a scraped hashtag or location page into a ``pandas DataFrame``, whose rows are posts and columns are each post's unique ID ([https://www.instagram.com/p/{postid}/]), Unix timestamp (which I convert into a timezone-aware ``datetime`` object), caption, user ID, number of likes, image height, and image width.\n",
    "\n",
    "On the initial run, entries for the following posts (identified by their unique ID [https://www.instagram.com/p/{postid}/]) in the ``.csv`` were misaligned (i.e. caption entries overflowed onto more than one row in the file; possibly related to use of the newline character \"\\n\" preceding a hashtag or following an emoji or special character).\n",
    "- ``castleriggstonecircle.csv``: qdsp5Lsscp\n",
    "- ``castlerigg.csv``: 86YRv5RjDC, qdsp5Lsscp\n",
    "- ``hadrianswall.csv``: BbUqWgel82l, BUt8UCCjmRN, BTVu93rjTXI, BONYw9wgfq8, BIwn3xvgPqp, BImts7TACtX, BEoVjPeE3-f, BEGkuD7LzAC, BC7ZRxNE34o, BCfh1M2E38s, BBsG3MSk3wW, BBmvDNek3wD, BBXRnonv_lP, 85AUZSLNk9, 6ZeoWrnZMl, 56wffPCJrF, 29NThlFvHY, otV_S3iJsE\n",
    "- ``reculvertowers.csv``: BIuoquwjmvu, 2_oLDNQ2xi\n",
    "- ``ruffordabbey.csv``: BfeMOtAA3GK, BO6bZ7MAT4t/, _1XTO8upge\n",
    "\n",
    "These raised ``TypeError``s when deriving ``time`` from ``unix`` timestamp. I thus make corrections into a namesake ``.xlsx``, then re-run the script."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 257,
   "metadata": {},
   "outputs": [],
   "source": [
    "# List all tags and location IDs:\n",
    "tags_list = [item for sublist in keyfreesite_df['tags'] for item in sublist]\n",
    "locs_list = [item for sublist in keyfreesite_df['locationids'] for item in sublist]\n",
    "\n",
    "# Create dictionaries to store DataFrames corresponding to each tag and location ID:\n",
    "tags_dict = dict.fromkeys(tags_list)\n",
    "locs_dict = dict.fromkeys(locs_list)\n",
    "\n",
    "# Create DataFrames to store the number of posts, and earliest and latest timestamps for each tag and location ID:\n",
    "tags_df = pd.DataFrame(tags_list,columns=['tag']).set_index('tag')\n",
    "tags_df = tags_df.assign(count=np.nan,earliest=np.nan,latest=np.nan) # creates new columns containing np.nan\n",
    "locs_df = pd.DataFrame(locs_list,columns=['tag']).set_index('tag')\n",
    "locs_df = locs_df.assign(count=np.nan,earliest=np.nan,latest=np.nan)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 258,
   "metadata": {},
   "outputs": [],
   "source": [
    "CSVCOLS = ['node.shortcode','node.taken_at_timestamp',\n",
    "           'node.edge_media_to_caption.edges.0.node.text',\n",
    "           'node.owner.id', 'node.edge_liked_by.count',\n",
    "           'node.dimensions.height','node.dimensions.width']\n",
    "RENAMEDCOLS = ['code','unix','caption','ownerid','likes','height','width']\n",
    "TIMEZONE = pytz.timezone(\"Europe/London\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 259,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "EmptyDataError: belasknapp.csv\n",
      "EmptyDataError: belasknaplongbarrow.csv\n",
      "EmptyDataError: belasknaplongbarrowgloucestershireengland.csv\n",
      "EmptyDataError: binhamprioryruins.csv\n",
      "EmptyDataError: binhampriorychurch.csv\n",
      "EmptyDataError: burystedmundsabbeygarden.csv\n",
      "EmptyDataError: burystedmundsabbeygardens.csv\n",
      "EmptyDataError: carneunyancientvillage.csv\n",
      "TypeError: castleriggstonecircle.csv\n",
      "TypeError: castlerigg.csv\n",
      "EmptyDataError: castleriggstones.csv\n",
      "EmptyDataError: castleriggstonecirle.csv\n",
      "EmptyDataError: castleriggstone.csv\n",
      "EmptyDataError: chichelecollege.csv\n",
      "EmptyDataError: cirencesteramphitheatre.csv\n",
      "TypeError: hadrianswall.csv\n",
      "EmptyDataError: longtowncastlewelshborder.csv\n",
      "EmptyDataError: maidencastledorset.csv\n",
      "EmptyDataError: minsterlovellhallruins.csv\n",
      "EmptyDataError: minsterlovellhallanddovecote.csv\n",
      "EmptyDataError: norhamcastleruins.csv\n",
      "EmptyDataError: oldoswestryhillfort.csv\n",
      "EmptyDataError: reculvertowersandromanfort.csv\n",
      "TypeError: reculvertowers.csv\n",
      "EmptyDataError: reculverfort.csv\n",
      "EmptyDataError: reculverromanfort.csv\n",
      "EmptyDataError: rollrightstonecircle.csv\n",
      "EmptyDataError: rollrightstone.csv\n",
      "EmptyDataError: rollrightstoneslivingsculpture.csv\n",
      "EmptyDataError: stalbanswall.csv\n",
      "TypeError: ruffordabbey.csv\n",
      "EmptyDataError: silchesterromanruins.csv\n",
      "EmptyDataError: silchesterromantownwall.csv\n",
      "EmptyDataError: silchesterromancity.csv\n",
      "EmptyDataError: stpaulsjarrow.csv\n",
      "EmptyDataError: stpaulsmonasteryjarrow.csv\n",
      "EmptyDataError: walltowncraggs.csv\n"
     ]
    }
   ],
   "source": [
    "for tag in tags_list:\n",
    "    try:\n",
    "        csv_df = pd.read_csv(\"{}.csv\".format(tag),usecols=CSVCOLS)[CSVCOLS]\n",
    "    except pd.io.common.EmptyDataError:\n",
    "        # .csv file is empty, i.e. no posts scraped\n",
    "        print(\"EmptyDataError: {}.csv\".format(tag))\n",
    "        continue\n",
    "    csv_df.columns = RENAMEDCOLS\n",
    "    try:\n",
    "        csv_df['time'] = csv_df.apply(lambda row: datetime.fromtimestamp(row['unix']).astimezone(TIMEZONE),axis=1)\n",
    "    except TypeError:\n",
    "        print(\"TypeError: {}.csv\".format(tag))\n",
    "        # Use the corrected .xlsx file instead\n",
    "        csv_df = pd.read_excel(\"{}.xlsx\".format(tag))[CSVCOLS]\n",
    "        csv_df.columns = RENAMEDCOLS\n",
    "        csv_df['time'] = csv_df.apply(lambda row: datetime.fromtimestamp(row['unix']).astimezone(TIMEZONE),axis=1)\n",
    "    tags_dict[tag] = csv_df\n",
    "    tags_df.loc[tag,'count'] = len(csv_df)\n",
    "    tags_df.loc[tag,'earliest'] = min(csv_df['time'])\n",
    "    tags_df.loc[tag,'latest'] = max(csv_df['time'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run and correct .csv's into .xlsx's as required\n",
    "for loc in locs_list:\n",
    "    csv_df = pd.read_csv(\"{}.csv\".format(loc),usecols=CSVCOLS)[CSVCOLS]\n",
    "    csv_df.columns = ['code','unix','caption','ownerid','likes','height','width']\n",
    "    try:\n",
    "        csv_df = pd.read_csv(\"{}.csv\".format(loc),usecols=CSVCOLS)[CSVCOLS]\n",
    "    except pd.io.common.EmptyDataError:\n",
    "        # .csv file is empty, i.e. no posts scraped\n",
    "        print(\"EmptyDataError: {}.csv\".format(loc))\n",
    "        continue\n",
    "    csv_df.columns = RENAMEDCOLS\n",
    "    try:\n",
    "        csv_df['time'] = csv_df.apply(lambda row: datetime.fromtimestamp(row['unix']).astimezone(TIMEZONE),axis=1)\n",
    "    except TypeError:\n",
    "        print(\"TypeError: {}.csv\".format(loc))\n",
    "        # Use the corrected .xlsx file instead\n",
    "        csv_df = pd.read_excel(\"{}.xlsx\".format(loc))[CSVCOLS]\n",
    "        csv_df.columns = RENAMEDCOLS\n",
    "        csv_df['time'] = csv_df.apply(lambda row: datetime.fromtimestamp(row['unix']).astimezone(TIMEZONE),axis=1)\n",
    "    locs_dict[tag] = csv_df\n",
    "    locs_df.loc[tag,'count'] = len(csv_df)\n",
    "    locs_df.loc[tag,'earliest'] = min(csv_df['time'])\n",
    "    locs_df.loc[tag,'latest'] = max(csv_df['time'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Sample outputs:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 263,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>code</th>\n",
       "      <th>unix</th>\n",
       "      <th>caption</th>\n",
       "      <th>ownerid</th>\n",
       "      <th>likes</th>\n",
       "      <th>height</th>\n",
       "      <th>width</th>\n",
       "      <th>time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>BxFdWW0gpoP</td>\n",
       "      <td>1557068833</td>\n",
       "      <td>Appuldurcombe House📷🙌🏼\\r\\n.\\r\\n.\\r\\n.\\r\\n#appu...</td>\n",
       "      <td>20596825</td>\n",
       "      <td>22</td>\n",
       "      <td>1350</td>\n",
       "      <td>1080</td>\n",
       "      <td>2019-05-05 16:07:13+01:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>BxEw5M7gQEM</td>\n",
       "      <td>1557045526</td>\n",
       "      <td>Appuldurcombe House🌿☀️\\r\\n.\\r\\n.\\r\\nA beautifu...</td>\n",
       "      <td>20596825</td>\n",
       "      <td>21</td>\n",
       "      <td>1350</td>\n",
       "      <td>1080</td>\n",
       "      <td>2019-05-05 09:38:46+01:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Bw-IZ9FHNF_</td>\n",
       "      <td>1556822972</td>\n",
       "      <td>This house is nicer than mine and it doesn't e...</td>\n",
       "      <td>627124561</td>\n",
       "      <td>16</td>\n",
       "      <td>1080</td>\n",
       "      <td>1080</td>\n",
       "      <td>2019-05-02 19:49:32+01:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Bw4bOnPAjbo</td>\n",
       "      <td>1556631513</td>\n",
       "      <td>Large Candles, these candles are made from 100...</td>\n",
       "      <td>3677437313</td>\n",
       "      <td>14</td>\n",
       "      <td>1080</td>\n",
       "      <td>1080</td>\n",
       "      <td>2019-04-30 14:38:33+01:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>BwlW8vwBtJL</td>\n",
       "      <td>1555991736</td>\n",
       "      <td>Ah my new house, just needs a few minor adjust...</td>\n",
       "      <td>193939315</td>\n",
       "      <td>36</td>\n",
       "      <td>1080</td>\n",
       "      <td>1080</td>\n",
       "      <td>2019-04-23 04:55:36+01:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          code        unix                                            caption  \\\n",
       "0  BxFdWW0gpoP  1557068833  Appuldurcombe House📷🙌🏼\\r\\n.\\r\\n.\\r\\n.\\r\\n#appu...   \n",
       "1  BxEw5M7gQEM  1557045526  Appuldurcombe House🌿☀️\\r\\n.\\r\\n.\\r\\nA beautifu...   \n",
       "2  Bw-IZ9FHNF_  1556822972  This house is nicer than mine and it doesn't e...   \n",
       "3  Bw4bOnPAjbo  1556631513  Large Candles, these candles are made from 100...   \n",
       "4  BwlW8vwBtJL  1555991736  Ah my new house, just needs a few minor adjust...   \n",
       "\n",
       "      ownerid  likes  height  width                      time  \n",
       "0    20596825     22    1350   1080 2019-05-05 16:07:13+01:00  \n",
       "1    20596825     21    1350   1080 2019-05-05 09:38:46+01:00  \n",
       "2   627124561     16    1080   1080 2019-05-02 19:49:32+01:00  \n",
       "3  3677437313     14    1080   1080 2019-04-30 14:38:33+01:00  \n",
       "4   193939315     36    1080   1080 2019-04-23 04:55:36+01:00  "
      ]
     },
     "execution_count": 263,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tags_dict['appuldurcombehouse'].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 262,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>earliest</th>\n",
       "      <th>latest</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>tag</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>appuldurcombehouse</th>\n",
       "      <td>432.0</td>\n",
       "      <td>2015-12-10 20:53:24+00:00</td>\n",
       "      <td>2019-05-05 16:07:13+01:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>appuldurcombe</th>\n",
       "      <td>288.0</td>\n",
       "      <td>2013-09-13 22:33:33+01:00</td>\n",
       "      <td>2019-05-05 16:07:13+01:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>belasknap</th>\n",
       "      <td>430.0</td>\n",
       "      <td>2015-03-28 17:55:33+00:00</td>\n",
       "      <td>2019-05-06 12:00:14+01:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>belasknapp</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>belasknaplongbarrow</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                     count                   earliest  \\\n",
       "tag                                                     \n",
       "appuldurcombehouse   432.0  2015-12-10 20:53:24+00:00   \n",
       "appuldurcombe        288.0  2013-09-13 22:33:33+01:00   \n",
       "belasknap            430.0  2015-03-28 17:55:33+00:00   \n",
       "belasknapp             NaN                        NaN   \n",
       "belasknaplongbarrow    NaN                        NaN   \n",
       "\n",
       "                                        latest  \n",
       "tag                                             \n",
       "appuldurcombehouse   2019-05-05 16:07:13+01:00  \n",
       "appuldurcombe        2019-05-05 16:07:13+01:00  \n",
       "belasknap            2019-05-06 12:00:14+01:00  \n",
       "belasknapp                                 NaN  \n",
       "belasknaplongbarrow                        NaN  "
      ]
     },
     "execution_count": 262,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tags_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Converting Unix timestamps to timezone-aware ``datetime`` objects\n",
    "Verifying that timezone conversion using ``pytz.timezone(\"Europe/London\")`` agrees with the date on Instagram:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "metadata": {},
   "outputs": [],
   "source": [
    "TIMEZONE = pytz.timezone(\"Europe/London\")\n",
    "burystedmundsabbey = pd.read_csv(\"burystedmundsabbey.csv\",usecols=['node.shortcode','node.taken_at_timestamp'])\n",
    "burystedmundsabbey['t_naive'] = burystedmundsabbey.apply(lambda row: datetime.utcfromtimestamp(row['node.taken_at_timestamp']),axis=1)\n",
    "burystedmundsabbey['t_aware'] = burystedmundsabbey.apply(lambda row: datetime.fromtimestamp(row['node.taken_at_timestamp']).astimezone(TIMEZONE),axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "_Note_: According to the ``pytz`` [documentation](http://pytz.sourceforge.net/), using the ``tzinfo`` argument of the standard ``datetime`` constructors (e.g. ``datetime.fromtimestamp(row['node.taken_at_timestamp'],TIMEZONE)``) \"does not work\" with pytz for many timezones."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 157,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "node.shortcode                           BWdvwKwFs6z\n",
       "node.taken_at_timestamp                   1499901730\n",
       "t_naive                          2017-07-12 23:22:10\n",
       "t_aware                    2017-07-13 00:22:10+01:00\n",
       "Name: 198, dtype: object"
      ]
     },
     "execution_count": 157,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "burystedmundsabbey.loc[198]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here, ``t_aware = 2017-07-13 00:22:10+01:00`` matches [the post's](https://www.instagram.com/p/BWdvwKwFs6z/) date of July 13, 2017."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 188,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Timestamp('2019-05-06 11:34:54+0100', tz='Europe/London')"
      ]
     },
     "execution_count": 188,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "max(burystedmundsabbey['t_aware'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 189,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Timestamp('2016-07-17 21:28:25+0100', tz='Europe/London')"
      ]
     },
     "execution_count": 189,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "min(burystedmundsabbey['t_aware'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  },
  "toc-autonumbering": false
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
